library(rvest)
## Loading required package: xml2
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
install.packages('ggmap',repos="http://ftp.iitm.ac.in/cran")
##
## The downloaded binary packages are in
## /var/folders/_q/5wf9q71n3gj3n1srx3hh68qc0000gn/T//Rtmpa3CvUT/downloaded_packages
library(ggmap)
Getting data from following link:
Dataset was downloaded as a CSV file prior to the analysis.
getwd()
## [1] "/Volumes/MYDISK/GreatLake/Day3_Assignments/Oct_8_2017"
mydata1=read.csv("calHosInpMortality.csv", header =TRUE)
names(mydata1)
## [1] "Year" "County"
## [3] "Hospital" "OSHPDID"
## [5] "Procedure.Condition" "Risk.Adjusted.Mortality.Rate"
## [7] "X..of.Deaths" "X..of.Cases"
## [9] "Hospital.Ratings" "Longitude"
## [11] "Latitude"
attach(mydata1)
str(mydata1)
## 'data.frame': 22270 obs. of 11 variables:
## $ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ County : Factor w/ 56 levels "AAAA","Alameda",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Hospital : Factor w/ 465 levels "Adventist Medical Center",..: 414 414 414 414 414 414 414 414 414 414 ...
## $ OSHPDID : int NA NA NA NA NA NA NA NA NA NA ...
## $ Procedure.Condition : Factor w/ 18 levels "AAA Repair","AAA Repair Unruptured",..: 17 18 11 15 7 10 5 4 13 16 ...
## $ Risk.Adjusted.Mortality.Rate: Factor w/ 501 levels "",".","0","0.2",..: 132 1 128 135 422 391 387 167 1 131 ...
## $ X..of.Deaths : Factor w/ 133 levels "",".","0","1",..: 9 1 10 40 57 40 38 47 1 74 ...
## $ X..of.Cases : Factor w/ 761 levels "",".","1","10",..: 398 1 486 705 468 374 401 17 1 118 ...
## $ Hospital.Ratings : Factor w/ 3 levels "As Expected",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Longitude : Factor w/ 342 levels "","-114.5956",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Latitude : Factor w/ 342 levels "",".","32.61909",..: 1 1 1 1 1 1 1 1 1 1 ...
## Modify column names for easier operation.
column_names <- c('year','county','hospitalName','hospitalId','procedureName','riskRate','numDeath','numCases','hospRating','long','lat')
colnames(mydata1) <- column_names
tail(mydata1,1)
## year county hospitalName hospitalId
## 22270 2015 Yuba Rideout Memorial Hospital 106580996
## procedureName riskRate numDeath numCases hospRating
## 22270 AAA Repair Unruptured 0 0 10 As Expected
## long lat
## 22270 -121.594363 39.138222
unique(mydata1$county)
## [1] AAAA Alameda Amador Butte
## [5] Calaveras Colusa Contra Costa Del Norte
## [9] El Dorado Fresno Glenn Humboldt
## [13] Imperial Inyo Kern Kings
## [17] Lake Lassen Los Angeles Madera
## [21] Marin Mariposa Mendocino Merced
## [25] Modoc Mono Monterey Napa
## [29] Nevada Orange Placer Plumas
## [33] Riverside Sacramento San Benito San Bernardino
## [37] San Diego San Francisco San Joaquin San Luis Obispo
## [41] San Mateo Santa Barbara Santa Clara Solano
## [45] Sonoma Santa Cruz Shasta Siskiyou
## [49] Stanislaus Trinity Tehama Tulare
## [53] Tuolumne Ventura Yolo Yuba
## 56 Levels: AAAA Alameda Amador Butte Calaveras Colusa ... Yuba
str(mydata1)
## 'data.frame': 22270 obs. of 11 variables:
## $ year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ county : Factor w/ 56 levels "AAAA","Alameda",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ hospitalName : Factor w/ 465 levels "Adventist Medical Center",..: 414 414 414 414 414 414 414 414 414 414 ...
## $ hospitalId : int NA NA NA NA NA NA NA NA NA NA ...
## $ procedureName: Factor w/ 18 levels "AAA Repair","AAA Repair Unruptured",..: 17 18 11 15 7 10 5 4 13 16 ...
## $ riskRate : Factor w/ 501 levels "",".","0","0.2",..: 132 1 128 135 422 391 387 167 1 131 ...
## $ numDeath : Factor w/ 133 levels "",".","0","1",..: 9 1 10 40 57 40 38 47 1 74 ...
## $ numCases : Factor w/ 761 levels "",".","1","10",..: 398 1 486 705 468 374 401 17 1 118 ...
## $ hospRating : Factor w/ 3 levels "As Expected",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ long : Factor w/ 342 levels "","-114.5956",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ lat : Factor w/ 342 levels "",".","32.61909",..: 1 1 1 1 1 1 1 1 1 1 ...
dim(mydata1)
## [1] 22270 11
dim(mydata1 %>% filter(is.na(county)))
## [1] 0 11
mydata1$county <- as.character(mydata1$county)
mydata1$hospitalName <- as.character(mydata1$hospitalName)
mydata1$hospitalId <- as.character(mydata1$hospitalId)
mydata1$procedureName <- as.character(mydata1$procedureName)
mydata1$riskRate <- as.numeric(mydata1$riskRate)
mydata1$numDeath <- as.numeric(mydata1$numDeath)
mydata1$numCases <- as.numeric(mydata1$numCases)
mydata1$hospRating <- as.character(mydata1$hospRating)
mydata1$long <- as.numeric(mydata1$long)
mydata1$lat <- as.numeric(mydata1$lat)
str(mydata1)
## 'data.frame': 22270 obs. of 11 variables:
## $ year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ county : chr "AAAA" "AAAA" "AAAA" "AAAA" ...
## $ hospitalName : chr "STATEWIDE" "STATEWIDE" "STATEWIDE" "STATEWIDE" ...
## $ hospitalId : chr NA NA NA NA ...
## $ procedureName: chr "PCI" "Pneumonia" "GI Hemorrhage" "Pancreatic Other" ...
## $ riskRate : num 132 1 128 135 422 391 387 167 1 131 ...
## $ numDeath : num 9 1 10 40 57 40 38 47 1 74 ...
## $ numCases : num 398 1 486 705 468 374 401 17 1 118 ...
## $ hospRating : chr "As Expected" "As Expected" "As Expected" "As Expected" ...
## $ long : num 1 1 1 1 1 1 1 1 1 1 ...
## $ lat : num 1 1 1 1 1 1 1 1 1 1 ...
unique(mydata1$hospRating)
## [1] "As Expected" "Better" "Worse"
## Let's perform NA analysis
dim(mydata1 %>% filter(is.na(county)))
## [1] 0 11
dim(mydata1 %>% filter(is.na(hospitalName)))
## [1] 0 11
dim(mydata1 %>% filter(is.na(hospitalId)))
## [1] 68 11
dim(mydata1 %>% filter(is.na(procedureName)))
## [1] 0 11
dim(mydata1 %>% filter(is.na(riskRate)))
## [1] 0 11
dim(mydata1 %>% filter(is.na(numDeath)))
## [1] 0 11
dim(mydata1 %>% filter(is.na(numCases)))
## [1] 0 11
dim(mydata1 %>% filter(is.na(hospRating)))
## [1] 0 11
dim(mydata1 %>% filter(is.na(long)))
## [1] 0 11
dim(mydata1 %>% filter(is.na(lat)))
## [1] 0 11
## It is evident from the above excersise that there are 68 records for which hospital id is NULL.
## Let's find out the details about those record where hospital ID is NULL.
dNullID <- mydata1 %>% filter(is.na(hospitalId))
## it is observed all 68 records belongs to hospital "STATEWIDE", It is also observed that this hospital
## belongs to AAAA county. Let's assign a hospitalID (OSPDID)
## retrieve county code from all the OSPDID, to find the pattern of county code. As per the data dictionary, "OSPDID" is a unique number established by the Office of Statewide Health Planning and Development (OSHPD) for identifying facilities and used in the Licensed Facility Information System (LFIS). The first 3 numbers identify the type of facility, the next two represent the county number, and the last five are randomly assigned wihin each county.
unique(substr(mydata1$hospitalId,4,5))
## [1] NA "01" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12" "13" "14"
## [15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
## [29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
## [43] "43" "48" "49" "44" "45" "47" "50" "53" "52" "54" "55" "56" "57" "58"
## it is clear that range of county code starts from 01, ends at 58. So, we can assign country code as 00 to this county. Also We noticed that first 3 digit is always 106. Hence our derived hospital id for this particular hospital "STATEWIDE" is 1060012345.
##
mydata1 <- mydata1 %>%
mutate (hospitalId = ifelse(hospitalName == "STATEWIDE", "1060012345", hospitalId))
str(mydata1)
## 'data.frame': 22270 obs. of 11 variables:
## $ year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ county : chr "AAAA" "AAAA" "AAAA" "AAAA" ...
## $ hospitalName : chr "STATEWIDE" "STATEWIDE" "STATEWIDE" "STATEWIDE" ...
## $ hospitalId : chr "1060012345" "1060012345" "1060012345" "1060012345" ...
## $ procedureName: chr "PCI" "Pneumonia" "GI Hemorrhage" "Pancreatic Other" ...
## $ riskRate : num 132 1 128 135 422 391 387 167 1 131 ...
## $ numDeath : num 9 1 10 40 57 40 38 47 1 74 ...
## $ numCases : num 398 1 486 705 468 374 401 17 1 118 ...
## $ hospRating : chr "As Expected" "As Expected" "As Expected" "As Expected" ...
## $ long : num 1 1 1 1 1 1 1 1 1 1 ...
## $ lat : num 1 1 1 1 1 1 1 1 1 1 ...
## We can see now county code is added for "AAAA" county as well.
unique(substr(mydata1$hospitalId,4,5))
## [1] "00" "01" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12" "13" "14"
## [15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
## [29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
## [43] "43" "48" "49" "44" "45" "47" "50" "53" "52" "54" "55" "56" "57" "58"
## Year-wise number of death
dfYearDeath <- mydata1 %>%
select(year,numDeath) %>%
group_by(year) %>%
dplyr::summarise(TotalYearDeath = sum(numDeath)) %>%
arrange(desc(TotalYearDeath))
## Show countywise death.
dfCounty <- mydata1 %>%
select(county,numDeath) %>%
group_by(county) %>%
dplyr::summarise(TotalCountyDeath = sum(numDeath)) %>%
arrange(desc(TotalCountyDeath))
## plot county-wise death.
ggplot(dfCounty)+ aes(reorder(county,TotalCountyDeath), TotalCountyDeath, fill = county )+
geom_col(width = 1) +
coord_flip()
## It is evident that, overall, "Los Angeles" county is maximum contributor to the death toll.
## plot year-wise death.
ggplot(dfYearDeath) + aes(year, TotalYearDeath) + geom_line()
## from the plot, it is evident that , maximum number of death occured in year 2014
## Show county wise death toll for the year 2014
df2014 <- mydata1 %>% filter(year==2014)
dfCountyDeath <- df2014 %>%
select(county,numDeath) %>%
group_by(county) %>%
dplyr::summarise(TotalCountyDeath = sum(numDeath)) %>%
arrange(desc(TotalCountyDeath))
## plot county-wise death for 2014.
ggplot(dfCountyDeath)+ aes(reorder(county,TotalCountyDeath), TotalCountyDeath, fill = county )+
geom_col(width = 1) +
coord_flip()
## It is evident that for year 2014 also, "Los Angeles" county is maximum contributor to the death toll.
## Show the county wise highest death toll
df2014_deathTollMax <- df2014 %>% group_by(county) %>% dplyr::summarise(numDeath=max(numDeath))
ggplot(df2014_deathTollMax)+
aes(reorder(county,numDeath),weight=numDeath)+
geom_bar() +
coord_flip()
## Hospital-wise number of death for Los Angeles county
dfHospDeath <- mydata1 %>%
filter(county=="Los Angeles")%>%
select(hospitalName,numDeath) %>%
group_by(hospitalName) %>%
dplyr::summarise(TotalHospDeath = sum(numDeath)) %>%
arrange(desc(TotalHospDeath))
## Plot the graph
ggplot(dfHospDeath)+
aes(reorder(hospitalName,TotalHospDeath),weight=TotalHospDeath)+
geom_bar() +
coord_flip()
## Lakewood Regional Medical Center, which belongs to Los Angeles county, is leading the list of death toll.
## findout the procedure impacting maximum.
dfLR <- mydata1 %>%
filter(hospitalName == "Lakewood Regional Medical Center")%>%
select(procedureName,numCases,numDeath) %>%
group_by(procedureName) %>%
dplyr::summarise(procCases = sum(numCases),procDeath=sum(numDeath))
## How many patients are alive against each of the procedure
dfLR$alive <- (dfLR$procCases - dfLR$procDeath)
## Remove negative entires, number of cases cannot be less that number of death,hence those are bad data hence removed.
dfLR <- dfLR %>%
filter(alive >= 0)
## find successrate against each procedure
dfLR$percentSuccess <- round((dfLR$alive/dfLR$procCases)*100,digits=2)
dfLR$percentFailure <- round(100.00-dfLR$percentSuccess,digits=2)
## Plot the procedures showing least successful in this hospital
ggplot(dfLR)+
aes(reorder(procedureName,percentFailure),weight=percentFailure)+
geom_bar() +
coord_flip()
## The diagram shows that this hospital has least success rate against Pancreatic Resection, Pancreatic Cancer, Espophageal Resection and other Pancreatic procedures.
## Plot the procedure causing maximum inpatient death over the whole period 2012-2015
dfprocWise <- mydata1 %>%
# filter(year == 2014) %>%
group_by(procedureName) %>%
dplyr::summarise(InpatientDeathCount = sum(numDeath)) %>%
arrange(desc(InpatientDeathCount))
ggplot(dfprocWise)+ aes(reorder(procedureName,InpatientDeathCount), InpatientDeathCount, fill = procedureName )+
geom_col(width = 1) +
coord_flip()
## Plot the procedure causing maximum inpatient death in 2014
dfprocWise2014 <- mydata1 %>%
filter(year == 2014) %>%
group_by(procedureName) %>%
dplyr::summarise(InpatientDeathCount = sum(numDeath)) %>%
arrange(desc(InpatientDeathCount))
ggplot(dfprocWise2014)+ aes(reorder(procedureName,InpatientDeathCount), InpatientDeathCount, fill = procedureName )+
geom_col(width = 1) +
coord_flip()
#Let's plot the hospitals as per their coordinates, yearwise. It shows a good overlap with very few outliners on 2012.
ggplot() + geom_point(data=mydata1,aes(x=long,y=lat,color=year))
# MODEL
options(repr.plot.width=10, repr.plot.height=6)
str(mydata1)
## 'data.frame': 22270 obs. of 11 variables:
## $ year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ county : chr "AAAA" "AAAA" "AAAA" "AAAA" ...
## $ hospitalName : chr "STATEWIDE" "STATEWIDE" "STATEWIDE" "STATEWIDE" ...
## $ hospitalId : chr "1060012345" "1060012345" "1060012345" "1060012345" ...
## $ procedureName: chr "PCI" "Pneumonia" "GI Hemorrhage" "Pancreatic Other" ...
## $ riskRate : num 132 1 128 135 422 391 387 167 1 131 ...
## $ numDeath : num 9 1 10 40 57 40 38 47 1 74 ...
## $ numCases : num 398 1 486 705 468 374 401 17 1 118 ...
## $ hospRating : chr "As Expected" "As Expected" "As Expected" "As Expected" ...
## $ long : num 1 1 1 1 1 1 1 1 1 1 ...
## $ lat : num 1 1 1 1 1 1 1 1 1 1 ...
## How the risk rate and percentage failure to avoid death are related to each other.
unique(mydata1$procedureName)
## [1] "PCI" "Pneumonia"
## [3] "GI Hemorrhage" "Pancreatic Other"
## [5] "AMI" "Espophageal Resection"
## [7] "Acute Stroke Ischemic" "Acute Stroke Hemorrhagic"
## [9] "Hip Fracture" "Pancreatic Resection"
## [11] "Acute Stroke" "Acute Stroke Subarachnoid"
## [13] "Heart Failure" "Carotid Endarterectomy"
## [15] "Craniotomy" "AAA Repair"
## [17] "Pancreatic Cancer" "AAA Repair Unruptured"
mydata <- mydata1
mydata1$procedureName <- as.factor(mydata1$procedureName)
mydata1$alive <- (mydata1$numCases - mydata1$numDeath)
## Remove negative entires, number of cases cannot be less that number of death,hence those are bad data hence removed.
mydata1<- mydata1 %>%
filter(alive >= 0)
mydata1$percentSuccess <- round((mydata1$alive/mydata1$numCases)*100,digits=2)
mydata1$percentFailure <- round(100.00-mydata1$percentSuccess,digits=2)
ggplot(mydata1) + aes(percentFailure,riskRate) + geom_point()
cor(mydata1$percentFailure, mydata1$riskRate)
## [1] -0.4396215
cor(log(mydata1$percentFailure),log(mydata1$riskRate))
## [1] -0.36639
## PRINCIPLE: Visualizing linear relationships
#We can try and fit a linear line to the data to see if there is a relationship
ggplot(mydata1) + aes( riskRate ,percentFailure)+ geom_point() + stat_smooth(method = 'lm')
# Get the value for Los Angeles, as LA is the highest contributor to death number.
dfLA <- mydata1 %>%
filter( county == "Los Angeles")
#filter( year == 2014) %>%
#arrange(year)
dfLAMutate <- dfLA %>%
mutate(riskRateLog = log(riskRate))
ggplot(dfLAMutate) + aes(riskRate) + geom_histogram(bins = 30)
ggplot(dfLAMutate) + aes(riskRateLog) + geom_histogram(bins = 30)
dfLAMutate <- dfLAMutate %>%
mutate(riskRateLogLag = lag(riskRateLog)) %>%
mutate(riskRateLogDiff = riskRateLog - lag(riskRateLogLag) )
# We can see that they are slightghly correlated
cor(dfLAMutate$riskRateLog, dfLAMutate$riskRateLogLag, use = 'complete')
## [1] 0.1828183
ggplot(dfLAMutate) + aes(riskRateLog, riskRateLogLag) + geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
summary(dfLA$riskRate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 3.00 93.07 133.00 500.00
summary(dfLA$percentSuccess)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 68.13 50.96 95.90 99.60
cor(dfLA$riskRate, dfLA$percentSuccess)
## [1] 0.3967339
cor(log(dfLA$riskRate), log(dfLA$numDeath))
## [1] 0.8177901
## It is observed thatRiskRate and number of Death at Los Angeles county are well corelated.
ggplot(dfLA) + aes(log(riskRate), log(numDeath)) + geom_point()
## It is observed that log of number of death increases with log of Risrate of the procedure.
ggplot(dfLA) + aes(log(riskRate), log(numDeath)) + geom_point()+ stat_smooth(method = 'lm')
## Try to find relation between hospital coordinates
cor(mydata1$long,mydata1$lat)
## [1] 0.8663495
## Seems to be very well co-related. Plot them as geom points
ggplot(mydata1) + aes(long, lat) + geom_point()
## Try to find linear regression and fit a line among points
ggplot(mydata1) + aes( long ,lat)+ geom_point() + stat_smooth(method = 'lm')
#It is observed that Year 2014 recorded highest number of death. Also, Los Angeles county recorded maximum number of inpatient deaths. At the same time it is observed that with increased rate of risk, number of deaths inceased.